Analytics Project 2025: Self-Checkout Fraud

Data Understanding and Exploration

Author

Philipp Altenbach, Ronny Grieder, Omar Rahiel, Emre Yelögrü

Published

March 2, 2025

This document presents an initial exploratory analysis of the data set related to self-checkout fraud detection. The focus is on understanding the structure of the fraud.csv data set before proceeding with further analysis.

Loading and Inspection of the Data

Code
##| echo: true #This can be added to selectively show specific code chunks.
# Build the path to fraud.csv: start from the directory returned by here(),
# go up two levels, then descend into the "Data" folder.
data_path <- here() |>
  dirname() |>
  dirname() |>
  file.path("Data", "fraud.csv")

# Read the CSV with fread(); the result is kept as `df` for the chunks below.
df <- fread(data_path)

Dimensions of the Dataset

Code
# Report the dimensions of the dataset.
# The counts are read straight from `df` with nrow()/ncol(), so this chunk
# does not depend on `num_rows` / `num_cols` having been computed elsewhere.
cat("The dataset contains", nrow(df), "rows and", ncol(df), "columns.\n")
The dataset contains 498121 rows and 10 columns.

Summary of the whole Dataset

Code
# Print skim()'s summary of df: an overview block followed by one row of
# statistics per column.
skim(df)
Data summary
Name df
Number of rows 498121
Number of columns 10
Key NULL
_______________________
Column type frequency:
numeric 10
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
trustLevel 0 1 3.50 1.71 1 2.00 4.00 5.00 6.00 ▇▅▅▅▅
totalScanTimeInSeconds 0 1 915.61 528.77 1 458.00 916.00 1374.00 1831.00 ▇▇▇▇▇
grandTotal 0 1 49.99 28.87 0 24.93 50.03 75.02 99.99 ▇▇▇▇▇
lineItemVoids 0 1 5.50 3.45 0 3.00 5.00 8.00 11.00 ▇▆▆▅▇
scansWithoutRegistration 0 1 5.00 3.16 0 2.00 5.00 8.00 10.00 ▇▅▅▅▆
quantityModifications 0 1 2.50 1.71 0 1.00 2.00 4.00 5.00 ▇▃▃▅▃
scannedLineItemsPerSecond 0 1 0.07 0.52 0 0.01 0.02 0.03 30.00 ▇▁▁▁▁
valuePerSecond 0 1 0.22 1.72 0 0.03 0.05 0.11 99.71 ▇▁▁▁▁
lineItemVoidsPerPosition 0 1 0.74 1.32 0 0.16 0.35 0.69 11.00 ▇▁▁▁▁
fraud 0 1 0.05 0.21 0 0.00 0.00 0.00 1.00 ▇▁▁▁▁

Underlying Data Types

Code
# Print one "name ➡️ class" line per column of df.
# vapply() guarantees exactly one string per column: if a column carries
# several classes they are joined with "/" instead of turning the result
# into a list (which is what sapply() would return in that case).
cat(
  paste(
    names(df), "➡️",
    vapply(df, function(col) paste(class(col), collapse = "/"), character(1))
  ),
  sep = "\n"
)
trustLevel ➡️ integer
totalScanTimeInSeconds ➡️ integer
grandTotal ➡️ numeric
lineItemVoids ➡️ integer
scansWithoutRegistration ➡️ integer
quantityModifications ➡️ integer
scannedLineItemsPerSecond ➡️ numeric
valuePerSecond ➡️ numeric
lineItemVoidsPerPosition ➡️ numeric
fraud ➡️ integer

Checking for missing values and duplicates

Code
# Report the number of missing cells and of duplicated rows.
# Both counts are computed directly from `df`, so the chunk is self-contained:
#   - sum(is.na(df))      counts NA cells over all columns
#   - sum(duplicated(df)) counts rows that repeat an earlier row
# NOTE(review): assumes any `missing_values` / `duplicate_count` defined
# outside this chunk had the same meaning - confirm.
# cat() already separates its arguments with a single space, so the string
# pieces carry no extra leading/trailing blanks (avoids doubled spaces).
cat(
  "The dataset contains", sum(is.na(df)), "missing values and",
  sum(duplicated(df)), "duplicates.\n"
)
The dataset contains 0 missing values and  0  duplicates.

Correlation Matrix

Code
# Correlation matrix over all columns of df; use = "complete.obs" drops rows
# with missing values, should any be present.
cor_matrix <- cor(x = df, use = "complete.obs")

# Draw the matrix with corrplot as coloured cells, printing the coefficient
# inside each cell and rotating the column labels by 45 degrees.
corrplot(
  cor_matrix,
  method = "color",
  col = COL1("YlOrRd"),
  tl.col = "grey30",
  tl.srt = 45,
  tl.cex = 0.675,
  addCoef.col = "white",
  number.cex = 0.8,
  addgrid.col = "white"
)

Data Exploration of all features

Distribution of the target variable fraud

Code
# Bar chart of the target variable: one bar per value of `fraud`, with the
# bar's count added as a text label.
t_hist <- ggplot(data = df, mapping = aes(x = factor(fraud))) +
  geom_bar(fill = "steelblue", alpha = 0.7) +
  geom_text(
    mapping = aes(label = after_stat(count)),
    stat = "count",
    size = 5
  ) +
  theme_minimal() +
  labs(
    title = "Distribution of Variable \"Fraud\"",
    x = "Fraud",
    y = "Count"
  )

# Hand the ggplot object to ggplotly() to display it as a plotly widget.
ggplotly(t_hist)

Various explorations of the remaining features with Fraud Breakdown

  1. A breakdown of fraudulent transactions per trustLevel
Code
# Stacked bar chart: one bar per trust level, split by fraud value
# (0 = steelblue, 1 = red).
t_hist <- ggplot(
  data = df,
  mapping = aes(x = factor(trustLevel), fill = factor(fraud))
) +
  geom_bar(alpha = 0.7, position = "stack") +
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) +
  theme_minimal() +
  labs(
    title = "Trust Level Distribution with Fraud Breakdown",
    x = "Trust Level",
    y = "Count",
    fill = "Fraud Status"
  )

# Display the chart through ggplotly().
ggplotly(t_hist)


  1. Insights into totalScanTimeInSeconds with Fraud Breakdown
Code
# Box plot of totalScanTimeInSeconds with one box per fraud value;
# outlier points are coloured red.
boxplot_scan_time <- ggplot(
  data = df,
  mapping = aes(
    x = factor(fraud),
    y = totalScanTimeInSeconds,
    fill = factor(fraud)
  )
) +
  geom_boxplot(outlier.color = "red", alpha = 0.7) +
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) +
  theme_minimal() +
  labs(
    title = "Total Scan Time by Fraud Status",
    x = "Fraud Status",
    y = "Total Scan Time (Seconds)",
    fill = "Fraud Status"
  )

# Display the chart through ggplotly().
ggplotly(boxplot_scan_time)
Code
# Density curves of totalScanTimeInSeconds, one semi-transparent curve per
# fraud value so that both remain visible where they overlap.
density_scan_time <- ggplot(
  data = df,
  mapping = aes(x = totalScanTimeInSeconds, fill = factor(fraud))
) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) +
  theme_minimal() +
  labs(
    title = "Density Plot of Total Scan Time by Fraud Status",
    x = "Total Scan Time (Seconds)",
    y = "Density",
    fill = "Fraud Status"
  )

# Display the chart through ggplotly().
ggplotly(density_scan_time)


  1. Insights into grandTotal with Fraud Breakdown
Code
# Box plot of grandTotal with one box per fraud value; outlier points are
# coloured red.
boxplot_grand_total <- ggplot(
  data = df,
  mapping = aes(x = factor(fraud), y = grandTotal, fill = factor(fraud))
) +
  geom_boxplot(outlier.color = "red", alpha = 0.7) +
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) +
  theme_minimal() +
  labs(
    title = "Grand Total by Fraud Status",
    x = "Fraud Status",
    y = "Transaction Amount (Grand Total)",
    fill = "Fraud Status"
  )

# Display the chart through ggplotly().
ggplotly(boxplot_grand_total)
Code
# Histogram of grandTotal in 30 bins; within each bin the counts of the two
# fraud values are stacked on top of each other.
hist_grand_total <- ggplot(
  data = df,
  mapping = aes(x = grandTotal, fill = factor(fraud))
) +
  geom_histogram(
    bins = 30,
    position = "stack",
    color = "black",
    alpha = 0.7
  ) +
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) +
  theme_minimal() +
  labs(
    title = "Distribution of Grand Total with Fraud Breakdown",
    x = "Transaction Amount (Grand Total)",
    y = "Count",
    fill = "Fraud Status"
  )

# Display the chart through ggplotly().
ggplotly(hist_grand_total)


  1. Insights into lineItemVoids with Fraud Breakdown
Code
# Box plot of lineItemVoids with one box per fraud value; outlier points are
# coloured red.
boxplot_lineItemVoids <- ggplot(
  data = df,
  mapping = aes(x = factor(fraud), y = lineItemVoids, fill = factor(fraud))
) +
  geom_boxplot(outlier.color = "red", alpha = 0.7) +
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) +
  theme_minimal() +
  labs(
    title = "lineItemVoids by Fraud Status",
    x = "Fraud Status",
    y = "lineItemVoids",
    fill = "Fraud Status"
  )

# Display the chart through ggplotly().
ggplotly(boxplot_lineItemVoids)
Code
# Stacked histogram of lineItemVoids, split by fraud value.
# lineItemVoids is an integer count (0 to 11 in this dataset). A fixed number
# of bins smaller than the number of distinct values makes some bars cover
# two neighbouring integers, producing artificial peaks. binwidth = 1 with
# center = 0 gives every integer value its own bar.
hist_lineItemVoids <- ggplot(df, aes(x = lineItemVoids, fill = factor(fraud))) +
  geom_histogram(
    binwidth = 1,
    center = 0,
    color = "black",
    alpha = 0.7,
    position = "stack"
  ) +
  labs(
    title = "Distribution of lineItemVoids with Fraud Breakdown",
    x = "lineItemVoids",
    y = "Count",
    fill = "Fraud Status"
  ) +
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) + # Custom colors
  theme_minimal()

# Display the chart through ggplotly().
ggplotly(hist_lineItemVoids)


  1. Insights into scansWithoutRegistration with Fraud Breakdown
Code
# Box plot of scansWithoutRegistration with one box per fraud value; outlier
# points are coloured red.
boxplot_scansWithoutRegistration <- ggplot(
  data = df,
  mapping = aes(
    x = factor(fraud),
    y = scansWithoutRegistration,
    fill = factor(fraud)
  )
) +
  geom_boxplot(outlier.color = "red", alpha = 0.7) +
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) +
  theme_minimal() +
  labs(
    title = "scansWithoutRegistration by Fraud Status",
    x = "Fraud Status",
    y = "scansWithoutRegistration",
    fill = "Fraud Status"
  )

# Display the chart through ggplotly().
ggplotly(boxplot_scansWithoutRegistration)
Code
# Stacked histogram of scansWithoutRegistration, split by fraud value.
# scansWithoutRegistration is an integer count (0 to 10 in this dataset, i.e.
# 11 distinct values). With only 10 bins, two neighbouring integers end up in
# the same bar. binwidth = 1 with center = 0 gives every integer its own bar.
hist_scansWithoutRegistration <- ggplot(
  df,
  aes(x = scansWithoutRegistration, fill = factor(fraud))
) +
  geom_histogram(
    binwidth = 1,
    center = 0,
    color = "black",
    alpha = 0.7,
    position = "stack"
  ) +
  labs(
    title = "Distribution of scansWithoutRegistration with Fraud Breakdown",
    x = "scansWithoutRegistration",
    y = "Count",
    fill = "Fraud Status"
  ) +
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) + # Custom colors
  theme_minimal()

# Display the chart through ggplotly().
ggplotly(hist_scansWithoutRegistration)


  1. Insights into quantityModifications with Fraud Breakdown
Code
# Box plot of quantityModifications with one box per fraud value; outlier
# points are coloured red.
boxplot_quantityModifications <- ggplot(
  data = df,
  mapping = aes(
    x = factor(fraud),
    y = quantityModifications,
    fill = factor(fraud)
  )
) +
  geom_boxplot(outlier.color = "red", alpha = 0.7) +
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) +
  theme_minimal() +
  labs(
    title = "quantityModifications by Fraud Status",
    x = "Fraud Status",
    y = "quantityModifications",
    fill = "Fraud Status"
  )

# Display the chart through ggplotly().
ggplotly(boxplot_quantityModifications)
Code
# Stacked histogram of quantityModifications, split by fraud value.
# quantityModifications is an integer count (0 to 5 in this dataset, i.e.
# 6 distinct values). With only 5 bins, two neighbouring integers share one
# bar. binwidth = 1 with center = 0 gives every integer value its own bar.
hist_quantityModifications <- ggplot(
  df,
  aes(x = quantityModifications, fill = factor(fraud))
) +
  geom_histogram(
    binwidth = 1,
    center = 0,
    color = "black",
    alpha = 0.7,
    position = "stack"
  ) +
  labs(
    title = "Distribution of quantityModifications with Fraud Breakdown",
    x = "quantityModifications",
    y = "Count",
    fill = "Fraud Status"
  ) +
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) + # Custom colors
  theme_minimal()

# Display the chart through ggplotly().
ggplotly(hist_quantityModifications)